Load required packages and read in combined data.

#packages
pacman::p_load(dplyr, 
               tidyr, 
               ggplot2, 
               rjson,
               rdatacite,
               cowplot, 
               stringr, 
               knitr, 
               DT)


# Restore the objects saved by 3_Combined_data.R
# (combined_dois is expected to be among them)
load(file = "data_rdata_files/Combined_ALL_data.Rdata")

# Keep only records whose publication year is 2012 or later
all_dois <- filter(combined_dois, publicationYear >= 2012)

All Metadata combined

Look at DOIs by their origin

# Number of DOIs in each source group, rendered as a table
kable(
  summarise(
    group_by(all_dois, group),
    count = n()
  )
)
group count
Affiliation - CrossRef 147702
Affiliation - Datacite 51053
IR_publisher 24104

General data cleaning

# DRUM is inconsistently specified (with and without the "(DRUM)" suffix), so
# every publisher containing the repository name gets one canonical label.
all_dois$publisher[
  grep("Data Repository for the University of Minnesota", all_dois$publisher)
] <- "Data Repository for the University of Minnesota (DRUM)"


# Remove MorphoSource data, as affiliation isn't included.
# A logical mask is used rather than `-which(...)`: when nothing matches,
# `-which(...)` is `-integer(0)`, which selects zero rows and would silently
# empty the whole data set. `%in%` also never yields NA, so rows with a
# missing publisher_plus are kept, as before.
all_dois2 <- all_dois[!(all_dois$publisher_plus %in% "Duke-MorphoSource Media"), ]

# Make sure "Dataset" is capitalized in all metadata resource types.
# The column is indexed directly (instead of assigning into a row subset), so
# the statement is a harmless no-op when no lowercase value is present; the
# row-subset form can fail on a plain data.frame in that case.
all_dois2$resourceTypeGeneral[
  which(all_dois2$resourceTypeGeneral == "dataset")
] <- "Dataset"

Collapse IRs into a single category

Look at all the Institutional Repositories Captured

# Tally DOIs per publisher_plus value among records in the IR_publisher group
IR_pubs <- summarise(
  group_by(
    filter(all_dois2, group == "IR_publisher"),
    publisher_plus
  ),
  count = n()
)

# Show the tally with readable column headers
kable(IR_pubs, col.names = c("Institutional Repository", "Count"))
Institutional Repository Count
Cornell 4758
Duke-Duke Digital Repository 76
Duke-Research Data Repository, Duke University 147
Michigan 10
Michigan-Deep Blue 637
Michigan-ICPSR/ISR 109
Michigan-Other 57
Minnesota 692
Virginia Tech 333
Washington U 4085

Replace all of these publishers with “Institutional Repository” so that they will be represented in a single bar.

# Flag ICPSR records first, using all_dois2 itself and its publisher names as
# they stand before the IR collapse below. The previous code computed these row
# positions on all_dois, which has more rows than all_dois2 (MorphoSource rows
# were dropped), so the positions could point at the wrong records. Taking the
# mask before the collapse keeps the original outcome: any record whose
# publisher mentions the Consortium ends up labelled "ICPSR", even if it also
# belongs to an institutional repository.
icpsr_rows <- grepl("Consortium for Political", all_dois2$publisher)

# Every publisher_plus value seen in the IR_publisher group becomes a single
# "Institutional Repository" category, so it is drawn as one bar.
all_dois2$publisher[
  all_dois2$publisher_plus %in% unique(IR_pubs$publisher_plus)
] <- "Institutional Repository"

# Catch the remaining IR records by publisher name: the rest of the
# "Cornell University Library", stray Virginia Tech records, and DRUM.
stray_ir_names <- c(
  "Cornell University Library",
  "University Libraries, Virginia Tech",
  "Data Repository for the University of Minnesota (DRUM)"
)
all_dois2$publisher[all_dois2$publisher %in% stray_ir_names] <- "Institutional Repository"

# ICPSR is also inconsistently named; normalise it last so it takes precedence.
all_dois2$publisher[icpsr_rows] <- "ICPSR"

Overall counts by publisher

Counts by publisher

# DOI count for every institution/publisher pair,
# largest counts first within each institution
by_publisher <- all_dois2 %>%
  group_by(institution, publisher) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))

# Interactive table of the counts
datatable(by_publisher)

Counts by resource type

# DOI count for every institution/resource-type pair,
# largest counts first within each institution
by_resource <- all_dois2 %>%
  group_by(institution, resourceTypeGeneral) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))

Create a table of top resources

# One row per resource type, one column per institution, plus a row total.
# Resource types absent at an institution are filled with 0.
by_resource_table <- by_resource %>%
  # filter(resourceTypeGeneral %in% c("Dataset", "Software", "Text", "Image")) %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  # Total sums the institution columns from Cornell through Washington U
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table, largest totals first
datatable(by_resource_table)

Write out the resources

# Save the resource-type counts as a CSV without a row-name column
write.csv(
  by_resource_table,
  file = "data_summary_data/Counts of Resource Types by Insitution.csv",
  row.names = FALSE
)

Data specific DOIs

Subset to only datasets

# Records whose general resource type is exactly "Dataset"
data_dois <- filter(all_dois2, resourceTypeGeneral == "Dataset")

Data DOIs by publisher

# Dataset DOI count for every publisher/institution pair,
# largest counts first within each institution
by_publisher_data <- data_dois %>%
  group_by(publisher, institution) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))

# Wide layout: one row per publisher, one column per institution (0 when a
# publisher has no DOIs there), plus a Total across Cornell..Washington U
by_publisher_data_table <- by_publisher_data %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table, largest totals first
datatable(by_publisher_data_table)

Write out the table of data publishers

# Save the data-publisher counts as a CSV without a row-name column
write.csv(
  by_publisher_data_table,
  file = "data_summary_data/Counts of Data Publishers By Insitituion.csv",
  row.names = FALSE
)

Software specific DOIs

Subset to only software (only datacite has software)

# Records whose general resource type is exactly "Software"
software_dois <- filter(all_dois2, resourceTypeGeneral == "Software")

# Software DOI count for every publisher/institution pair,
# largest counts first within each institution
by_publisher_software <- software_dois %>%
  group_by(publisher, institution) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))

# Wide layout: one row per publisher, one column per institution (0 when
# absent), plus a Total across Cornell..Washington U
by_publisher_software_table <- by_publisher_software %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table, largest totals first
datatable(by_publisher_software_table)

Write out the table of software publishers

# Save the software-publisher counts as a CSV without a row-name column
write.csv(
  by_publisher_software_table,
  file = "data_summary_data/Counts of Software Publishers By Insitituion.csv",
  row.names = FALSE
)

Graphs

Top publishers - Data DOIs

Plot publishers by rank, ordered from most DOIs to least (take top 20)

# Bar chart of total dataset DOIs per publisher, positioned by publisher rank
# (1 = most DOIs). The view is restricted to ranks 1-20.
by_publisher_data %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  # Rank is the row position after sorting. order() returns a sorting
  # permutation, not a rank, and only equalled 1..n because the rows were
  # already sorted; row_number() states the intent directly.
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  labs(x = "Publisher Rank", y = "Number of DOIs", title = "Number of DOIs by top Publishers") +
  coord_cartesian(xlim = c(1, 20)) +
  theme_bw()

If we look at the top 9 publishers for the combined data (both DataCite and CrossRef), how many DOIs does this cover?

# First nine publishers of the wide table (which is sorted by descending Total)
top9pubs <- by_publisher_data_table$publisher[seq_len(9)]

# Split publishers into "in the top 9" vs. the rest, then report for each side
# the DOI total, the number of publishers, and the share of all DOIs
by_publisher_data %>%
  group_by(publisher) %>%
  summarise(count = sum(count)) %>%
  mutate(intop9pub = publisher %in% top9pubs) %>%
  group_by(intop9pub) %>%
  summarise(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs / sum(totalDOIs)) %>%
  kable(
    col.names = c(
      "In Top 9 Publishers",
      "Total N DOIs",
      "Total N Publishers",
      "Proportion of Total DOIs"
    )
  )
In Top 9 Publishers Total N DOIs Total N Publishers Proportion of Total DOIs
FALSE 2274 159 0.0128642
TRUE 174496 9 0.9871358

Plotting Number of DOIs in the top 9 publishers by institution

# Fixed fill colour for each of the nine publishers shown in the plot
top9colors <- c("Harvard Dataverse" = "dodgerblue2",
                "Zenodo" = "darkorange1",
                "ICPSR" = "darkcyan",
                "Dryad" = "lightgray",
                "Qualitative Data Repository" = "gold1",
                "figshare" = "purple",
                "ENCODE Data Coordination Center" = "red",
                "Faculty Opinions Ltd" = "darkgreen",
                "Institutional Repository" = "lightblue")


# Dodged bars of dataset DOI counts per institution, one bar per top-9
# publisher. The y axis is zoomed to 0-5000 with coord_cartesian, so taller
# bars are visually cut off rather than dropped from the data.
# Caption fix: "ENCODEbar" was missing a space in the saved figure.
(by_publisher_data_plot <- by_publisher_data %>%
    filter(publisher %in% top9pubs) %>%
    ggplot(aes(x = institution, y = count, fill = publisher)) +
    geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
    scale_fill_manual(values = top9colors, name = "Publisher") +
    guides(fill = guide_legend(title.position = "top")) +
    scale_y_continuous(breaks = seq(from = 0, to = 5000, by = 500)) +
    coord_cartesian(ylim = c(0, 5000)) +
    labs(x = "Institution", y = "Count of Data DOIs - CrossRef & DataCite", caption = "Note: Michigan Dataverse & ENCODE bar cutoff for scaling") +
    theme_bw() +
    theme(legend.position = "bottom", legend.title.align = .5))

# Save the figure as an 8 x 6 inch PNG
ggsave(by_publisher_data_plot, filename = "figures/Counts of Data DOIs by Institution - CrossRef and DataCite.png", device = "png", width = 8, height = 6, units = "in")

Institutional Graphs

Cornell

Duke

Michigan

Minnesota

Virginia Tech

Wash U

Top publishers - Software DOIs

Look at the top software publishers (this excludes the CrossRef affiliation data, as software is not a CrossRef resource type).

# First six publishers of the software table (sorted by descending Total)
top6pubs_soft <- by_publisher_software_table$publisher[seq_len(6)]

# Fixed fill colour for each software publisher shown in the plot
top6colors_soft <- c(
  "Zenodo" = "darkorange1",
  "Code Ocean" = "darkblue",
  "Institutional Repository" = "lightblue",
  "Optica Publishing Group" = "red",
  "CoMSES Net" = "pink",
  "figshare" = "purple"
)

# Dodged bars of software DOI counts per institution, one bar per publisher
by_publisher_software_plot <- by_publisher_software %>%
  filter(publisher %in% top6pubs_soft) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top6colors_soft, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  labs(x = "Institution", y = "Count of Software DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot
by_publisher_software_plot

# Save the figure as an 8 x 6 inch PNG
ggsave(
  filename = "figures/Counts of Software DOIs by Institution.png",
  plot = by_publisher_software_plot,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)

Institutional Graphs - Software

Cornell

Duke

Michigan

Minnesota

Virginia Tech

Wash U

Datacite affiliation + IR only

Subset to remove the CrossRef affiliation data from data DOIs

# Dataset DOI counts per publisher/institution, leaving out every record from
# the "Affiliation - CrossRef" group
by_publisher_data_dc <- data_dois %>%
  filter(group != "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))

# Wide layout: one row per publisher, one column per institution (0 when
# absent), plus a Total across Cornell..Washington U
by_publisher_data_dc_table <- by_publisher_data_dc %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table, largest totals first
datatable(by_publisher_data_dc_table)

Overall counts by publisher

Look at publishers by rank of data DOIs

# Publisher plot: total dataset DOIs per publisher (CrossRef affiliation
# records excluded upstream), positioned by publisher rank (1 = most DOIs).
# The view is restricted to ranks 1-20.
by_publisher_data_dc %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  # Rank is the row position after sorting. order() returns a sorting
  # permutation, not a rank, and only equalled 1..n because the rows were
  # already sorted; row_number() states the intent directly.
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  coord_cartesian(xlim = c(1, 20)) +
  labs(x = "Publisher Rank", y = "Number of DOIs", title = "Number of DOIs by top Publishers") +
  theme_bw()

Look at 7 publishers here. How many DOIs does this capture?

# First seven publishers of the DataCite/IR table (sorted by descending Total)
top7pubs <- by_publisher_data_dc_table$publisher[seq_len(7)]

# Split publishers into "in the top 7" vs. the rest, then report for each side
# the DOI total, the number of publishers, and the share of all DOIs
by_publisher_data_dc %>%
  group_by(publisher) %>%
  summarise(count = sum(count)) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarise(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs / sum(totalDOIs))
## # A tibble: 2 × 4
##   intop7pub totalDOIs nrepos propDOIs
##   <lgl>         <int>  <int>    <dbl>
## 1 FALSE          2168    153   0.0746
## 2 TRUE          26900      7   0.925

Graphs

Top 7 publishers

# Fixed fill colour for each of the seven publishers shown in the plot
top7colors <- c(
  "Harvard Dataverse" = "dodgerblue2",
  "Zenodo" = "darkorange1",
  "ICPSR" = "darkcyan",
  "Dryad" = "lightgray",
  "Qualitative Data Repository" = "gold1",
  "figshare" = "purple",
  "Institutional Repository" = "lightblue"
)

# Dodged bars of dataset DOI counts per institution, one bar per top-7
# publisher. The y axis is zoomed to 0-5000 with coord_cartesian, so taller
# bars are visually cut off rather than dropped from the data.
# Note: this reuses (overwrites) the name by_publisher_data_plot.
by_publisher_data_plot <- by_publisher_data_dc %>%
  filter(publisher %in% top7pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  scale_y_continuous(breaks = seq(from = 0, to = 5000, by = 500)) +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    x = "Institution",
    y = "Count of Data DOIs",
    caption = "Note: Michigan Dataverse bar cutoff for scaling"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot
by_publisher_data_plot

# Save the figure as an 8 x 6 inch PNG
ggsave(
  filename = "figures/Counts of DataCite Data DOIs by Institution.png",
  plot = by_publisher_data_plot,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)

Institutional Graphs

Cornell

Duke

Michigan

Minnesota

Virginia Tech

Wash U

CrossRef affiliation only

Look at DOIs that came from the CrossRef search by affiliation.

# Dataset DOI counts from the "Affiliation - CrossRef" group only, in wide
# layout: one row per publisher, one column per institution (0 when absent),
# plus a Total across the institution columns.
by_crossref_publisher_data <- data_dois %>%
  filter(group == "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  summarize(count = n()) %>%
  # Sort by institution before widening, as the other publisher tables in this
  # file do. pivot_wider creates columns in order of first appearance, and the
  # Total below sums the positional range Cornell:`Washington U`; without this
  # sort an institution column could fall outside that range and be left out
  # of Total.
  arrange(institution, desc(count)) %>%
  pivot_wider(names_from = institution,
              values_from = count,
              values_fill = 0) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table, largest totals first
by_crossref_publisher_data %>%
  datatable()

Graphs

# Dodged bars of CrossRef-affiliation dataset DOI counts per institution, one
# bar per publisher (default fill colours). The y axis is zoomed to 0-5000 with
# coord_cartesian, so taller bars are visually cut off rather than dropped.
by_crossref_publisher_data_plot <- data_dois %>%
  filter(group == "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  guides(fill = guide_legend(title.position = "top")) +
  scale_y_continuous(breaks = seq(from = 0, to = 5000, by = 500)) +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    x = "Institution",
    y = "Count of Data DOIs",
    caption = "Note: Michigan ENCODE bar cutoff for scaling"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot
by_crossref_publisher_data_plot

# Save the figure as a 10 x 6 inch PNG
ggsave(
  filename = "figures/Counts of CrossRef Data DOIs by Institution.png",
  plot = by_crossref_publisher_data_plot,
  device = "png",
  width = 10,
  height = 6,
  units = "in"
)

Collapsed DOIs

Some repositories (such as Harvard’s Dataverse and Qualitative Data Repository) assign DOIs at the level of the file, rather than the study. Similarly, Zenodo often has many related DOIs for multiple figures within a study. In order to attempt to compare study-to-study counts of data sharing, look at the DOIs collapsed by “container”.

# For records that have a container identifier: number of DOIs sharing each
# container/publisher/title/institution combination, largest groups first
by_container <- all_dois2 %>%
  filter(!is.na(container_identifier)) %>%
  group_by(container_identifier, publisher, title, institution) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

How many publishers have container DOIs?

# Number of by_container rows per publisher, most frequent first,
# shown as an interactive table
by_container %>%
  group_by(publisher) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  datatable()

Collapsing by container for counts

# Row positions of records that repeat an already-seen, non-missing container
# identifier (the first record of each container is not flagged).
containerdups <- which(
  !is.na(all_dois2$container_identifier) &
    duplicated(all_dois2$container_identifier)
)

# Keep every row that is not a flagged duplicate. setdiff() on the full row
# index is used instead of `-containerdups`: with no duplicates, the negative
# index is `-integer(0)`, which selects zero rows and would empty the data set.
all_dois_collapsed <- all_dois2[setdiff(seq_len(nrow(all_dois2)), containerdups), ]

Overall Count of Data DOIs

# Container-collapsed records whose general resource type is exactly "Dataset"
data_dois_collapse <- filter(all_dois_collapsed, resourceTypeGeneral == "Dataset")

# Collapsed dataset DOI count for every publisher/institution pair,
# largest counts first within each institution
by_publisher_data_collapse <- data_dois_collapse %>%
  group_by(publisher, institution) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))

Table of publisher counts

# Wide layout: one row per publisher, one column per institution (0 when
# absent), plus a Total across Cornell..Washington U
by_publisher_data_collapse_table <- by_publisher_data_collapse %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive table, largest totals first
datatable(by_publisher_data_collapse_table)

Write out the table of data publishers

# Save the container-collapsed data-publisher counts as a CSV
# without a row-name column
write.csv(
  by_publisher_data_collapse_table,
  file = "data_summary_data/Counts of Data Publishers By Insitituion - Collapsed by container.csv",
  row.names = FALSE
)

Graphs

For these graphs, ENCODE and Faculty Opinions Ltd (CrossRef affiliations) will be removed from the visualization.

Top 7 publishers of data dois

# Collapsed dataset DOI counts per publisher/institution, leaving out every
# record from the "Affiliation - CrossRef" group
by_publisher_data_dc_collapse <- data_dois_collapse %>%
  filter(group != "Affiliation - CrossRef") %>%
  group_by(publisher, institution) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))

# Table of publishers - data: one row per publisher, one column per
# institution (0 when absent), plus a Total across Cornell..Washington U
by_publisher_data_dc_collapse_table <- by_publisher_data_dc_collapse %>%
  pivot_wider(
    names_from = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

Look at publishers based on rank of number of DOIs

# Bar chart of collapsed dataset DOI totals per publisher, positioned by
# publisher rank (1 = most DOIs); the x scale is limited to 0-25.
by_publisher_data_dc_collapse_table %>%
  group_by(publisher) %>%
  summarize(count = sum(Total)) %>%
  arrange(desc(count)) %>%
  # Rank is the row position after sorting. order() returns a sorting
  # permutation, not a rank, and only equalled 1..n because the rows were
  # already sorted; row_number() states the intent directly.
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 25)) +
  labs(x = "Publisher Rank", y = "Number of DOIs", title = "Number of DOIs by top Publishers") +
  theme_bw()

Look at the top 7 publishers - how many does this capture?

# First seven publishers of the collapsed table (sorted by descending Total).
# Note: this overwrites the earlier top7pubs.
top7pubs <- by_publisher_data_dc_collapse_table$publisher[seq_len(7)]

# Split publishers into "in the top 7" vs. the rest, then report for each side
# the DOI total, the number of publishers, and the share of all DOIs
by_publisher_data_dc_collapse_table %>%
  group_by(publisher) %>%
  summarise(count = sum(Total)) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarise(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs / sum(totalDOIs))
## # A tibble: 2 × 4
##   intop7pub totalDOIs nrepos propDOIs
##   <lgl>         <int>  <int>    <dbl>
## 1 FALSE          1441    153    0.105
## 2 TRUE          12228      7    0.895
# Fixed fill colour for each of the seven publishers shown in the collapsed
# plot. Note: this overwrites the earlier top7colors.
top7colors <- c(
  "Harvard Dataverse" = "dodgerblue2",
  "Zenodo" = "darkorange1",
  "ICPSR" = "darkcyan",
  "Dryad" = "lightgray",
  "figshare" = "purple",
  "Institutional Repository" = "lightblue",
  "Taylor & Francis" = "gold2"
)



# Dodged bars of collapsed dataset DOI counts per institution, one bar per
# top-7 publisher.
# NOTE(review): the y-axis cutoff below is commented out, yet the caption still
# mentions a bar cutoff - confirm which of the two is intended.
by_publisher_data_plot_collapse <- by_publisher_data_dc_collapse %>%
  filter(publisher %in% top7pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  # scale_y_continuous(breaks = seq(from = 0, to=5000, by=500)) +
  # coord_cartesian(ylim = c(0,5000)) +
  labs(
    x = "Institution",
    y = "Count of Data DOIs",
    caption = "Note: Michigan Dataverse bar cutoff for scaling"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot
by_publisher_data_plot_collapse

# Save the figure as an 8 x 6 inch PNG
ggsave(
  filename = "figures/Counts of DataCite Data DOIs by Institution_DOIcollapsed.png",
  plot = by_publisher_data_plot_collapse,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)

Institutional Graphs - Collapsed

Cornell

Duke

Michigan

Minnesota

Virginia Tech

Wash U

Write out institutional data

Write out CSV files for each institution:

  • All DOIs
  • All DOIs collapsed
# Date stamp (YYYYMMDD) appended to every output file name; computed once so
# all files written by this run share the same stamp.
date_stamp <- gsub("-", "", Sys.Date())

# For each institution, write two CSVs: all of its DOIs, and its DOIs after
# collapsing by container.
for (i in unique(all_dois2$institution)) {
  # Use the cleaned data (all_dois2), matching the loop's institution list and
  # the collapsed export, which is derived from all_dois2. The previous code
  # wrote from all_dois, i.e. still containing the MorphoSource rows and
  # without the resource-type and publisher normalisation.
  all_dois2 %>%
    filter(institution == i) %>%
    write.csv(file = paste0("data_all_dois/All_dois_", i, date_stamp, ".csv"), row.names = FALSE)

  all_dois_collapsed %>%
    filter(institution == i) %>%
    write.csv(file = paste0("data_all_dois/All_dois_collapsed_", i, date_stamp, ".csv"), row.names = FALSE)
}